#include "gb-include.h"

#include "HashTableX.h"
#include "Domains.h"
#include "Mem.h"

// . returns the "domain" of a dotted numeric ip host: everything before the
//   last '.', so "1.2.3.4" yields "1.2.3"
// . the returned pointer is "host" itself; *dlen receives the number of bytes
//   before the last '.'
// . returns NULL and sets *dlen to 0 if host is NULL or empty, contains no
//   '.', or its only '.' is the very first character
char *getDomainOfIp ( char *host , int32_t hostLen , int32_t *dlen ) {
	// nothing to scan. without this guard we would form host-1 and
	// report a length of -1 for an empty host.
	if ( ! host || hostLen <= 0 ) { *dlen = 0; return NULL; }
	// walk back from the last character to the last '.'
	char *lastDot = host + hostLen - 1;
	while ( lastDot > host && *lastDot != '.' ) lastDot--;
	// reached the start of host: no usable '.' was found
	if ( lastDot == host ) { *dlen = 0; return NULL; }
	// the domain is everything before that last '.'
	*dlen = (int32_t)( lastDot - host );
	return host;
}


// . returns a pointer into "host" at the start of the domain: the label
//   immediately left of "tld" plus the tld itself, e.g. "xyz.com" for host
//   "www.xyz.com" with tld pointing at "com"
// . "tld" must point into "host" (the commented-out code suggests it is the
//   result of getTLD()) or be NULL
// . *dlen is set to the domain length, measured from the returned pointer to
//   host+hostLen; it is 0 whenever NULL is returned
// . returns NULL if tld is NULL, if host is nothing but the tld, if tld is
//   not preceded by a '.', or if the label left of the tld is empty
//   (".com", "a..com")
char *getDomain ( char *host , int32_t hostLen , char *tld , int32_t *dlen ) {
	// assume no domain
	*dlen = 0;
	// no valid tld means no domain
	if ( ! tld ) return NULL;
	// host consists of the tld only
	if ( tld == host ) return NULL;
	// there MUST be a period right before the tld
	char *dot = tld - 1;
	if ( *dot != '.' ) return NULL;
	// . the label left of that period must not be empty
	// . this also keeps us from stepping to host-1 when the period is
	//   the first character of host
	if ( dot == host || dot[-1] == '.' ) return NULL;
	// back up over the label until we hit "host" or another period
	char *s = dot - 1;
	while ( s > host && *s != '.' ) s--;
	// . now *s=='.' or s==host
	// . if s is host and not a period then the whole host is the domain
	// . the period check handles hosts like ".xyz.com"
	if ( s == host && *s != '.' ) { *dlen = hostLen; return s; }
	// skip forward over the period to point to the domain name
	s++;
	// the domain runs from s to the end of host
	*dlen = hostLen - (int32_t)( s - host );
	return s;
}

// . returns a pointer into "host" at the start of the longest trailing
//   suffix of up to three dot-separated labels that isTLD() accepts, or NULL
//   if none of the candidates is accepted
// . candidates are tried shortest first ("im", then "co.im", then
//   "lkd.co.im"); a later (longer) match overrides an earlier one
// . only host[0..hostLen-1] is read here; the original note said host must be
//   NULL terminated -- TODO confirm whether any caller relies on that
// . a NULL or empty host yields NULL (previously host[-1] was read)
char *getTLD ( char *host , int32_t hostLen ) {
	// nothing to examine, and host+hostLen-1 would precede the buffer
	if ( ! host || hostLen <= 0 ) return NULL;
	char *hostEnd = host + hostLen;
	// best candidate accepted so far
	char *tld = NULL;
	// "s" scans backwards; each pass stops on a period or on host
	char *s = hostEnd;
	// . three passes: one-, two- and three-label suffixes
	// . the table in isTLD() has no entry with more than three labels
	for ( int32_t pass = 0 ; pass < 3 ; pass++ ) {
		// step off the end (first pass) or over the period found
		// by the previous pass
		s--;
		// back up to the previous period or the start of host
		while ( s > host && *s != '.' ) s--;
		// the candidate starts just after the period, if any
		char *t = s;
		if ( *t == '.' ) t++;
		// just because a shorter suffix matched doesn't mean a
		// longer one can't, so keep overriding
		if ( isTLD ( t , hostEnd - t ) ) tld = t;
		// ran out of host, nothing further left to try
		if ( s == host ) break;
	}
	// we must have gotten the tld by this point, if there was a valid one
	return tld;
}

// . file-scope hash table holding the 64-bit hashes of the TLD strings listed
//   in isTLD() below
// . filled lazily on the first isTLD() call that reaches the table lookup and
//   cleared by resetDomains()
static HashTableX s_table;
// . returns true if the tldLen bytes at "tld" are accepted as a TLD
// . any non-period character that fails is_alpha_a() rejects the candidate
// . a candidate with no period is always accepted (this includes the empty
//   string when tldLen is 0)
// . a candidate with exactly one period is accepted only if its
//   hash64Lower_a() hash is in s_table
// . a candidate with two or more periods is always rejected
// . on a table setup failure this returns whatever log() returns
//   (presumably false -- TODO confirm)
bool isTLD ( char *tld , int32_t tldLen ) {

	// number of periods seen in the candidate
	int32_t pcount = 0;
	// single-label TLDs are no longer checked against a fixed list, so
	// just validate the characters and count the periods
	for ( int32_t i = 0 ; i < tldLen ; i++ ) {
		// period count
		if ( tld[i] == '.' ) { pcount++; continue; }
		// presumably an ascii letter test -- verify against is_alpha_a
		if ( ! is_alpha_a(tld[i]) ) return false;
	}

	// no period: accept without consulting the table
	if ( pcount == 0 ) return true;
	// NOTE(review): this rejects every candidate with two periods, so the
	// three-label entries below ("LKD.CO.IM", "PLC.CO.IM") can never
	// match -- confirm whether that is intended
	if ( pcount >= 2 ) return false;

	// otherwise, if one period, check table to see if qualified

	// set once s_table has been fully populated
	static bool       s_isInitialized = false;
	// . i shrunk this list a lot
	// . see backups for the old list
	// . entries without a period are still hashed into the table, but
	//   the pcount == 0 shortcut above returns before any lookup
	static char      *s_tlds[] = {
	"AB.CA",
	"AC",
	"AC.AE",
	"AC.AT",
	"AC.CN",
	"AC.CR",
	"AC.CY",
	"AC.FJ",
	"AC.GG",
	"AC.ID",
	"AC.IL",
	"AC.IM",
	"AC.IN",
	"AC.JE",
	"AC.JP",
	"AC.KR",
	"AC.NZ",
	"AC.PA",
	"AC.TH",
	"AC.UG",
	"AC.UK",
	"AC.YU",
	"AC.ZA",
	"AD",
	"AD.JP",
	"AE",
	"AERO",
	"AH.CN",
	"AI",
	"ALDERNEY.GG",
	"ALT.ZA",
	"AM",          // 10.am
	"ART.BR",
	"ART.DO",
	"ARTS.CO",
	"ARTS.VE",
	"ASN.AU",
	"ASN.LV",
	"AG",
	"AS",
	"AT",
	"AU",
	"AW",
	"AZ",
	"BA",
	"BB",
	"BBS.TR",
	"BC.CA",
	"BD",
	"BE",
	"BF",
	"BG",
	"BH",
	"BI",
	"BIB.VE",
	"BIZ",
	"BJ",
	"BJ.CN",
	"BM",
	"BN",
	"BO",
	"BR",
	"BS",
	"BT",
	"BV",
	"BW",
	"BY",
	"BZ",
	"CA",
	"CC",
	"CD",  // mdw
	"CF",
	"CG",
	"CH",
	"CI",
	"CK",
	"CL",
	"CM",
	"CN",
	"CO",
	"CO.AT",
	"CO.AO",
	"CO.CK",
	"CO.CR",
	"CO.GG",
	"CO.HU",
	"CO.ID",
	"CO.IL",
	"CO.IM",
	"CO.IN",
	"CO.JE",
	"CO.JP",
	"CO.KR",
	"COM",
	"COM.AR",
	"COM.AU",
	"COM.AZ",
	"COM.BB",
	"COM.BM",
	"COM.BR",
	"COM.BS",
	"COM.CN",
	"COM.CO",
	"COM.CU",
	"COM.CY",
	"COM.DO",
	"COM.EC",
	"COM.EG",
	"COM.FJ",
	"COM.GE",
	"COM.GU",
	"COM.HK",
	"COM.JO",
	"COM.KH",
	"COM.LA",
	"COM.LB",
	"COM.LC",
	"COM.LV",
	"COM.LY",
	"COM.MM",
	"COM.MO",
	"COM.MT",
	"COM.MX",
	"COM.MY",
	"COM.NA",
	"COM.NC",
	"COM.NI",
	"COM.NP",
	"COM.PA",
	"COM.PE",
	"COM.PH",
	"COM.PL",
	"COM.PY",
	"COM.RU",
	"COM.SG",
	"COM.SH",
	"COM.SY",
	"COM.TN",
	"COM.TR",
	"COM.TW",
	"COM.UA",
	"COM.UY",
	"COM.VE",
	"CONF.AU",
	"CONF.LV",
	"CO.NZ",
	"COOP",
	"CO.AE",
	"CO.SV",
	"CO.TH",
	"CO.UG",
	"CO.UK",
	"CO.VE",
	"CO.VI",
	"CO.YU",
	"CO.ZA",
	"CQ.CN",
	"CR",
	"CSIRO.AU",
	"CU",
	"CV",
	"CX",
	"CY",
	"CZ",
	"DE",
	"DJ",
	"DK",
	"DM",
	"DO",
	"DZ",
	"EC",
	"ED.CR",
	"EDU",
	"EDU.BM",
	"EDU.AR",
	"EDU.CN",
	"EDU.CO",
	"EDU.DO",
	"EDU.EC",
	"EDU.EG",
	"EDU.GE",
	"EDU.GU",
	"EDU.JO",
	"EDU.LC",
	"EDU.LV",
	"EDU.MM",
	"EDU.MO",
	"EDU.MY",
	"EDUNET.TN",
	"EDU.PA",
	"EDU.PY",
	"EDU.SG",
	"EDU.SH",
	"EDU.TR",
	"EDU.TW",
	"EDU.UY",
	"EDU.VE",
	"EDU.YU",
	"EDU.ZA",
	"EE",
	"EG",
	"EH",
	"ENS.TN",
	"ER",
	"ERNET.IN",
	"ES",
	"ESP.BR",
	"ET",
	"ETC.BR",
	"EU",
	"EUN.EG",
	"FI",
	"FI.CR",
	"FIN.EC",
	"FIN.TN",
	"FIRM.CO",
	"FIRM.VE",
	"FJ",
	"FK",
	"FM",
	"FO",
	"FR",
	"FX",
	"G12.BR",
	"GA",
	"GB",
	"GD",
	"GD.CN",
	"GE",
	"GEN.NZ",
	"GF",
	"GG",
	"GH",
	"GI",
	"GL",
	"GM",
	"GN",
	"GOB.PA",
	"GO.CR",
	"GO.ID",
	"GO.KR",
	"GO.TH",
	"GO.UG",
	"GOV",
	"GOV.AE",
	"GOV.AR",
	"GOV.AU",
	"GOV.BM",
	"GOV.BR",
	"GOV.CN",
	"GOV.CO",
	"GOV.CY",
	"GOV.DO",
	"GOV.EC",
	"GOV.EG",
	"GOVE.TW",
	"GOV.FJ",
	"GOV.GE",
	"GOV.GG",
	"GOV.GU",
	"GOV.IL",
	"GOV.IM",
	"GOV.IN",
	"GOV.JE",
	"GOV.JO",
	"GOV.JP",
	"GOV.LB",
	"GOV.LC",
	"GOV.LV",
	"GOV.MM",
	"GOV.MO",
	"GOV.MY",
	"GOV.SG",
	"GOV.SH",
	"GOV.TN",
	"GOVT.NZ",
	"GOV.TR",
	"GOV.UA",
	"GOV.UK",
	"GOV.VE",
	"GOV.ZA",
	"GP",
	"GQ",
	"GR",
	"GS",
	"GS.CN",
	"GT",
	"GU",
	"GUERNSEY.GG",
	"GW",
	"GX.CN",
	"GY",
	"GZ.CN",
	"HB.CN",
	"HE.CN",
	"HI.CN",
	"HK",
	"HK.CN",
	"HL.CN",
	"HM",
	"HN",
	"HN.CN",
	"HR",
	"HT",
	"HU",
	"ID",
	"ID.AU",
	"ID.FJ",
	"ID.LV",
	"IE",
	"IL",
	"IM",
	"IN",
	"IND.BR",
	"IND.GG",
	"IND.JE",
	"IND.TN",
	"INF.BR",
	"INFO",
	"INFO.AU",
	"INFO.CO",
	"INFO.HU",
	"INFO.TN",
	"INFO.VE",
	"INT",
	"INT.CO",
	"INTL.TN",
	"INT.VE",
	"IO",
	"IQ",
	"IR",
	"IS",
	"IT",
	"JE",
	"JERSEY.JE",
	"JL.CN",
	"JM",
	"JO",
	"JP",
	"JS.CN",
	"K12.EC",
	"K12.IL",
	"K12.TR",
	"KE",
	"KG",
	"KH",
	"KI",
	"KIDS",
	"KM",
	"KN",
	"KP",
	"KR",
	"KW",
	"KY",
	"KZ",
	"LA",
	"LB",
	"LC",
	"LI",
	"LK",
	"LKD.CO.IM",
	"LN.CN",
	"LR",
	"LS",
	"LT",
	"LTD.GG",
	"LTD.JE",
	"LTD.UK",
	"LU",
	"LV",
	"LY",
	"MA",
	"MB.CA",
	"MC",
	"MD",
	"ME",
	"MED.EC",
	"MG",
	"MH",
	"MIL",
	"MIL.BR",
	"MIL.CO",
	"MIL.DO",
	"MIL.EC",
	"MIL.GE",
	"MIL.GU",
	"MIL.ID",
	"MIL.LB",
	"MIL.LV",
	"MIL.PH",
	"MIL.SH",
	"MIL.TR",
	"MIL.VE",
	"MIL.ZA",
	"MK",
	"ML",
	"MM",
	"MN",
	"MO",
	"MO.CN",
	"MOD.UK",
	"MP",
	"MQ",
	"MR",
	"MS",
	"MT",
	"MU",
	"MUNI.IL",
	"MUSEUM",
	"MV",
	"MW",
	"MX",
	"MY",
	"MZ",
	"NA",
	"NAME",
	"NAT.TN",
	"NB.CA",
	"NC",
	"NE",
	"NET",
	"NET.AR",
	"NET.AU",
	"NET.AZ",
	"NET.BB",
	"NET.BM",
	"NET.BR",
	"NET.BS",
	"NET.CN",
	"NET.CU",
	"NET.CY",
	"NET.DO",
	"NET.EC",
	"NET.EG",
	"NET.GE",
	"NET.GG",
	"NET.GU",
	"NET.HK",
	"NET.ID",
	"NET.IL",
	"NET.IM",
	"NET.IN",
	"NET.JE",
	"NET.JO",
	"NET.JP",
	"NET.KH",
	"NET.LA",
	"NET.LB",
	"NET.LC",
	"NET.LV",
	"NET.LY",
	"NET.MM",
	"NET.MO",
	"NET.MT",
	"NET.MX",
	"NET.MY",
	"NET.NA",
	"NET.NC",
	"NET.NP",
	"NET.NZ",
	"NET.PA",
	"NET.PE",
	"NET.PH",
	"NET.PL",
	"NET.PY",
	"NET.RU",
	"NET.SG",
	"NET.SH",
	"NET.SY",
	"NET.TH",
	"NET.TN",
	"NET.TR",
	"NET.TW",
	"NET.UA",
	"NET.UK",
	"NET.UY",
	"NET.VE",
	"NET.VI",
	"NET.ZA",
	"NF",
	"NF.CA",
	"NG",
	"NGO.PH",
	"NGO.ZA",
	"NHS.UK",
	"NI",
	"NIC.IM",
	"NIC.IN",
	"NL",
	"NM.CN",
	"NM.KR",
	"NO",
	"NOM.CO",
	"NOM.VE",
	"NOM.ZA",
	"NP",
	"NR",
	"NS.CA",
	"NSK.SU",
	"NT.CA",
	"NU",
	"NUI.HU",
	"NX.CN",
	"NZ",
	"OM",
	"ON.CA",
	"OR.CR",
	"ORG",
	"ORG.AE",
	"ORG.AR",
	"ORG.AU",
	"ORG.AZ",
	"ORG.BB",
	"ORG.BM",
	"ORG.BR",
	"ORG.BS",
	"ORG.CN",
	"ORG.CO",
	"ORG.CU",
	"ORG.CY",
	"ORG.DO",
	"ORG.EC",
	"ORG.EG",
	"ORG.FJ",
	"ORG.GE",
	"ORG.GG",
	"ORG.GU",
	"ORG.HK",
	"ORG.HU",
	"ORG.IL",
	"ORG.IM",
	"ORG.JE",
	"ORG.JP",
	"ORG.KH",
	"ORG.LA",
	"ORG.LB",
	"ORG.LC",
	"ORG.LV",
	"ORG.LY",
	"ORG.MM",
	"ORG.MO",
	"ORG.MT",
	"ORG.MX",
	"ORG.MY",
	"ORG.NA",
	"ORG.NC",
	"ORG.NZ",
	"ORG.PA",
	"ORG.PE",
	"ORG.PH",
	"ORG.PL",
	"ORG.PY",
	"ORG.RU",
	"ORG.SG",
	"ORG.SH",
	"ORG.SY",
	"ORG.TN",
	"ORG.TR",
	"ORG.TW",
	"ORG.UK",
	"ORG.UY",
	"ORG.VE",
	"ORG.VI",
	"ORG.YU",
	"ORG.ZA",
	"OR.ID",
	"OR.KR",
	"OR.TH",
	"ORT.NP",
	"OR.UG",
	"OZ.AU",
	"PA",
	"PE",
	"PE.CA",
	"PF",
	"PG",
	"PH",
	"PK",
	"PL",
	"PLC.CO.IM",
	"PLC.UK",
	"PM",
	"PN",
	"POLICE.UK",
	"PR",
	"PRIV.HU",
	"PRO",
	"PSI.BR",
	"PT",
	"PVT.GE",
	"PW",
	"PY",
	"QA",
	"QC.CA",
	"QH.CN",
	"RE",
	"REC.BR",
	"REC.CO",
	"REC.VE",
	"RE.KR",
	"RES.IN",
	"RNRT.TN",
	"RNS.TN",
	"RNU.TN",
	"RO",
	"RU",
	"RW",
	"SA",
	"SA.CR",
	"SARK.GG",
	"SB",
	"SC",
	"SC.CN",
	"SCH.GG",
	"SCH.JE",
	"SCHOOL.FJ",
	"SCHOOL.ZA",
	"SCH.UK",
	"SCI.EG",
	"SD",
	"SE",
	"SG",
	"SH",
	"SH.CN",
	"SI",
	"SJ",
	"SK",
	"SK.CA",
	"SL",
	"SLD.PA",
	"SM",
	"SN",
	"SN.CN",
	"SO",
	"SR",
	"ST",
	"STORE.CO",
	"STORE.VE",
	"SU",
	"SV",
	"SX.CN",
	"SY",
	"SZ",
	"TC",
	"TD",
	"TEC.VE",
	"TELEMEMO.AU",
	"TF",
	"TG",
	"TH",
	"TJ",
	"TJ.CN",
	"TK",
	"TM",
	"TM.HU",
	"TMP.BR",
	"TM.ZA",
	"TN",
	"TO",
	"TOURISM.TN",
	"TP",
	"TR",
        "TRAVEL",
	"TT",
	"TV",
	"TW",
	"TW.CN",
	"TZ",
	"UA",
	"UG",
	"UK",
	"UM",
	"US",
	"UY",
	"UZ",
	"VA",
	"VC",
	"VE",
	"VG",
	"VI",
	"VN",
	"VU",
	"WEB.CO",
	"WEB.DO",
	"WEB.VE",
	"WEB.ZA",
	"WF",
	"WS",
	"XJ.CN",
	"XZ.CN",
	"YE",
	"YK.CA",
	"YN.CN",
	"YT",
	"YU",
	"ZA",
	"ZJ.CN",
	"ZM",
	"ZR",
	"ZW" };

	// lazily build the lookup table the first time we get this far
	if ( ! s_isInitialized ) {
		// . set up the hash table
		// . presumably 8-byte keys (the int64_t hashes) with no data
		//   payload -- TODO confirm against HashTableX::set
		// . NOTE(review): sizeof(s_tlds)*2 is twice the array size in
		//   BYTES, not twice the entry count; if this argument is a
		//   slot count the table is larger than needed -- confirm
		if ( ! s_table.set ( 8 , 0, sizeof(s_tlds)*2,NULL,0,false,0,
				     "tldtbl") ) 
			return log("build: Could not init table of TLDs.");
		// now add in the hash of every TLD string
		int32_t n = (int32_t)sizeof(s_tlds)/ sizeof(char *); 
		for ( int32_t i = 0 ; i < n ; i++ ) {
			char      *d    = s_tlds[i];
			int32_t       dlen = gbstrlen ( d );
			// same hash function as the lookup below
			int64_t  dh   = hash64Lower_a ( d , dlen );
			// on failure the flag stays false, so the next call
			// runs this whole setup again
			if ( ! s_table.addKey (&dh,NULL) )
				return log("build: dom table failed");
		}
		s_isInitialized = true;
	} 
	// hash the candidate the same way the table entries were hashed
	int64_t h = hash64Lower_a ( tld , tldLen );
	// accepted only if that hash was added above
	return s_table.isInTable ( &h );
}		

// . calls reset() on the file-scope TLD hash table (presumably releasing its
//   memory -- confirm against HashTableX::reset)
// . NOTE(review): the s_isInitialized flag is a static local of isTLD() and is
//   not cleared here, so if reset() empties the table a later isTLD() call
//   would not repopulate it -- confirm this is only called at shutdown
void resetDomains ( ) {
	s_table.reset();
}
